# Notebook setup: plotting libraries, data libraries and display options.
# The statements below are order-sensitive (offline mode is enabled before the
# notebook mode is initialised), so keep them in this sequence.

# plotly standard imports
import plotly.graph_objs as go
import chart_studio.plotly as py
# Cufflinks wrapper on plotly
# NOTE(review): the `.iplot(...)` calls made on pandas objects later in this
# notebook presumably come from cufflinks - confirm.
import cufflinks
# Data science imports
import pandas as pd
import numpy as np
# Options for pandas: show up to 30 columns when a frame is displayed
pd.options.display.max_columns = 30
# Display the value of every expression statement in a cell,
# not only the last one
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = 'all'
from plotly.offline import iplot, init_notebook_mode
# NOTE(review): presumably switches cufflinks/plotly to offline rendering
# inside the notebook; the exact effect of `connected=True` should be
# confirmed against the library documentation.
cufflinks.go_offline(connected=True)
init_notebook_mode(connected=True)
# Set global theme
cufflinks.set_config_file(world_readable=True, theme='pearl')
We will use the Bitcoin Historical Data dataset as the target dataset:
Bitcoin data at 1-min intervals from select exchanges, Jan 2012 to Dec 2020
from src.load_datasets import load_input_dataset

# Load the raw input frame through the project loader and preview
# its first five rows.
input_dataset = load_input_dataset()
input_dataset.head(5)
| | Timestamp | Open | High | Low | Close | Volume_(BTC) | Volume_(Currency) | Weighted_Price |
|---|---|---|---|---|---|---|---|---|
| 0 | 1325317920 | 4.39 | 4.39 | 4.39 | 4.39 | 0.455581 | 2.0 | 4.39 |
| 1 | 1325317980 | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| 2 | 1325318040 | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| 3 | 1325318100 | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| 4 | 1325318160 | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
We will explore the full input dataset. Some values are NaN, which sweetviz cannot interpret, so we will use the timestamp as the target feature for now.
import sweetviz as sv

# Build a sweetviz report for the frame (labelled 'Input'), using the
# Timestamp column as the target feature, and render it inline.
analyse_report = sv.analyze(
    source=[input_dataset, 'Input'],
    target_feat='Timestamp',
)
analyse_report.show_notebook()
We will keep one sample per hour (every 60th one-minute row) for faster interpretation.
# Keep every 60th row, starting at position 59.
input_dataset = input_dataset.iloc[59::60]
# Detach the Timestamp column from the frame; it is kept separately so the
# remaining columns are all numeric features.
raw_timestamps = input_dataset.pop('Timestamp')
The timestamps need to be interpreted as dates for chart processing.
# Interpret the raw values as seconds since the Unix epoch.
input_datetime = pd.to_datetime(raw_timestamps, unit="s")
# Preview the remaining feature columns.
input_dataset.head(5)
| | Open | High | Low | Close | Volume_(BTC) | Volume_(Currency) | Weighted_Price |
|---|---|---|---|---|---|---|---|
| 59 | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| 119 | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| 179 | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| 239 | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| 299 | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
Feature evolution over time
# Select the three price columns and index them by datetime so the x-axis
# of the chart is time.
input_features = input_dataset.loc[:, ['Open', 'Close', 'Weighted_Price']]
input_features.index = input_datetime

# One subplot per column.
input_features.iplot(subplots=True)
# Summary statistics, one row per feature.
input_dataset.describe().T
| | count | mean | std | min | 25% | 50% | 75% | max |
|---|---|---|---|---|---|---|---|---|
| Open | 57539.0 | 4597.869997 | 4781.860481 | 4.230000 | 436.000000 | 3308.83000 | 8178.790000 | 2.892734e+04 |
| High | 57539.0 | 4600.855412 | 4785.273937 | 4.230000 | 436.335000 | 3312.82000 | 8183.195000 | 2.897090e+04 |
| Low | 57539.0 | 4594.739766 | 4778.247471 | 4.230000 | 435.900000 | 3304.00000 | 8173.460000 | 2.892734e+04 |
| Close | 57539.0 | 4597.868472 | 4781.911473 | 4.230000 | 436.025000 | 3305.00000 | 8177.300000 | 2.897090e+04 |
| Volume_(BTC) | 57539.0 | 9.003106 | 29.133884 | 0.000001 | 0.370362 | 1.85132 | 6.922594 | 1.540919e+03 |
| Volume_(Currency) | 57539.0 | 31184.490106 | 105810.113064 | 0.000178 | 411.516416 | 3109.00990 | 20379.086506 | 4.865723e+06 |
| Weighted_Price | 57539.0 | 4597.865954 | 4781.891626 | 4.230000 | 436.030107 | 3305.86258 | 8177.593176 | 2.895104e+04 |
We will keep only the last three years, because that period has no missing values.
# The frame was subsampled above to every 60th one-minute record, so window
# sizes are expressed in hourly rows.
day = 24           # rows per day
year = 365 * day   # rows per (non-leap) year

# Keep the most recent three years of rows.
input_dataset = input_dataset.tail(3 * year)

# Rebuild the datetimes from the untouched raw timestamps, selecting by the
# row labels that survived the truncation. The previous version truncated
# `input_datetime` from itself by position, so the two objects could drift
# apart when cells were re-run (the recorded output shows 0 datetimes against
# 26280 rows). Selecting by label keeps them aligned and makes this cell safe
# to re-run; a missing label raises KeyError here instead of surfacing later
# as a length mismatch.
input_datetime = pd.to_datetime(
    raw_timestamps.loc[input_dataset.index],
    unit='s',
)

input_dataset.head()
len(input_datetime)
| | Open | High | Low | Close | Volume_(BTC) | Volume_(Currency) | Weighted_Price |
|---|---|---|---|---|---|---|---|
| 3151019 | 13633.25 | 13670.99 | 13631.95 | 13631.95 | 0.871489 | 11882.568302 | 13634.789405 |
| 3151079 | 13460.87 | 13489.55 | 13460.60 | 13489.55 | 8.170318 | 110043.490480 | 13468.690911 |
| 3151139 | 13320.00 | 13350.00 | 13303.09 | 13350.00 | 10.190223 | 135747.508220 | 13321.347592 |
| 3151199 | 13540.81 | 13611.64 | 13527.87 | 13527.87 | 11.510917 | 156584.137430 | 13603.098257 |
| 3151259 | 13616.00 | 13640.59 | 13606.68 | 13640.59 | 3.617158 | 49269.986162 | 13621.187176 |
0
# Plot the same three price columns again, this time for the truncated
# frame, indexed by datetime.
input_features = input_dataset.loc[:, ['Open', 'Close', 'Weighted_Price']]
input_features.index = input_datetime

# One subplot per column.
input_features.iplot(subplots=True)
--------------------------------------------------------------------------- ValueError Traceback (most recent call last) <ipython-input-19-7bbc70516061> in <module> 1 input_features = input_dataset[['Open', 'Close', 'Weighted_Price']] ----> 2 input_features.index = input_datetime 3 4 input_features.iplot( 5 subplots=True, /usr/local/lib/python3.6/dist-packages/pandas/core/generic.py in __setattr__(self, name, value) 5152 try: 5153 object.__getattribute__(self, name) -> 5154 return object.__setattr__(self, name, value) 5155 except AttributeError: 5156 pass pandas/_libs/properties.pyx in pandas._libs.properties.AxisProperty.__set__() /usr/local/lib/python3.6/dist-packages/pandas/core/generic.py in _set_axis(self, axis, labels) 562 def _set_axis(self, axis: int, labels: Index) -> None: 563 labels = ensure_index(labels) --> 564 self._mgr.set_axis(axis, labels) 565 self._clear_item_cache() 566 /usr/local/lib/python3.6/dist-packages/pandas/core/internals/managers.py in set_axis(self, axis, new_labels) 225 if new_len != old_len: 226 raise ValueError( --> 227 f"Length mismatch: Expected axis has {old_len} elements, new " 228 f"values have {new_len} elements" 229 ) ValueError: Length mismatch: Expected axis has 26280 elements, new values have 0 elements
# Materialise the training data as a two-column frame.
train_df = pd.DataFrame(
    data=tfds.as_numpy(train_data),
    columns=['text', 'type'],
)
# Replace each raw label with the value returned by `humanize_label`.
train_df['type'] = train_df['type'].apply(humanize_label)
train_df.head(5)
N/A% (0 of 1600) | | Elapsed Time: 0:00:00 ETA: --:--:--
Start reading dataset from ./data/training.1600000.processed.noemoticon.csv
100% (1600 of 1600) |####################| Elapsed Time: 0:07:46 Time: 0:07:46
| | text | type |
|---|---|---|
| 0 | b"@switchfoot http://twitpic.com/2y1zl - Awww,... | bad |
| 1 | b"is upset that he can't update his Facebook b... | bad |
| 2 | b'@Kenichan I dived many times for the ball. M... | bad |
| 3 | b'my whole body feels itchy and like its on fi... | bad |
| 4 | b"@nationwideclass no, it's not behaving at al... | bad |
# Report the number of training rows.
print(f'Training dataset records {len(train_df)}')

# Histogram of the label column.
label_column = train_df['type']
label_column.iplot(kind='hist', yTitle='count', xTitle='Type', title='Training data distribution')
Training dataset records 1600000